# Initializing the environment
import pandas as pd
import numpy as np
# I don't like warnings, so silence all of them
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
# Graphics in SVG format are sharper and more legible
%config InlineBackend.figure_format = 'svg'
# Load the two source datasets (cancer records and average household size)
# from their local CSV files.
cancer, household = (
    pd.read_csv(csv_path)
    for csv_path in (
        "D:/MachineLearning/exercises-linear-regression-exercise-1/original/cancer_reg.csv",
        "D:/MachineLearning/exercises-linear-regression-exercise-1/original/avg-household-size.csv",
    )
)
Merging the two data files on the basis of geography
# Join the two frames on their shared 'geography' column
# (pandas' default join type is used).
new_cancer = pd.merge(left=cancer, right=household, on='geography')
# Column listing of the merged frame, then of the original cancer frame.
new_cancer.columns
cancer.columns
Let's first look at the data
# First five and last five rows of the cancer frame.
cancer.head(n=5)
cancer.tail(n=5)
Here, target_deathrate is the target variable that we have to predict
# Summary statistics of the cancer frame, using describe()'s default settings.
cancer.describe()
# Peek at the raw 'geography' strings before splitting them.
cancer.geography.head(2)
# Split 'geography' once, on the first comma, into two columns
# (column 0 -> County, column 1 -> State). No regex is involved.
a = new_cancer["geography"].str.split(",", n=1, expand=True)
# Strip surrounding whitespace: splitting "<county>, <state>" on "," alone
# would otherwise leave a leading space on the state part, which then
# leaks into every later group-by / comparison on 'State'.
new_cancer['County'] = a[0].str.strip()
new_cancer['State'] = a[1].str.strip()
# IPython help lookup for str.split (notebook-only syntax, not plain Python).
?str.split
# Column listing of the merged frame after 'County' and 'State' were added.
new_cancer.columns
# First row of the original (unmerged) cancer frame.
cancer.head(1)
# Plotly / cufflinks setup for interactive charts rendered inside the notebook.
# NOTE(review): `plotly.plotly` was moved to the separate `chart_studio`
# package in plotly 4 — confirm the installed plotly version still provides it.
# `py` is not referenced in the part of the file visible here.
import plotly.plotly as py
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode,plot, iplot
import cufflinks as cf
# Initialise plotly's notebook mode (connected=True) and switch cufflinks to offline use.
init_notebook_mode(connected=True)
cf.go_offline()
# Total target_deathrate per (State, statefips) pair.
grouped = new_cancer.groupby(['State', 'statefips'])
# Use .sum() directly: passing a {'name': func} dict to SeriesGroupBy.agg
# (dict renaming) was deprecated and raises SpecificationError on pandas >= 1.0.
# reset_index() turns the group keys back into ordinary columns, giving a
# DataFrame with columns State, statefips, target_deathrate.
# NOTE(review): this sums county-level rates, so states with more counties
# get larger totals — confirm a sum (rather than a mean) is what is wanted.
new_data = grouped['target_deathrate'].sum().reset_index()
new_data.head(10)
new_data.info()
# Load the state lookup table from a local CSV and inspect it.
states = pd.read_csv('D:/MachineLearning/states.csv')
states.head(3)
states.info()
states.shape
# Joining State and new_data: copy the 'Abbreviation' column onto new_data.
# NOTE(review): this assignment aligns rows by index label, not by state
# name, so it is only correct if `states` lists the same states in the same
# order as `new_data` — verify, or replace with a merge on the state name.
new_data['abbreviation'] = states['Abbreviation']
# Persist the per-state table and load it back from disk.
# index=False keeps the row index out of the file; without it the reloaded
# frame gains a spurious "Unnamed: 0" column.
new_data.to_csv("D:/MachineLearning/new_data.csv", index=False)
new_data = pd.read_csv("D:/MachineLearning/new_data.csv")
new_data.head(10)
new_data.shape
# Trace specification: one colour value per state abbreviation.
data = {
    'type': 'choropleth',
    'colorscale': 'Portland',
    'locations': new_data['abbreviation'],
    'z': new_data["target_deathrate"],
    'locationmode': "USA-states",
    'colorbar': {'title': 'Target Death Rate'},
}
# Figure layout: title plus a map restricted to the USA with lakes drawn.
layout = {
    'title': "Target death rate due to cancer in the United States",
    'geo': {'scope': 'usa', 'showlakes': True},
}
# Assemble the figure dictionary and render it inline.
choromap = {'data': [data], 'layout': layout}
iplot(choromap)
The geomap above shows the distribution of target death rates across the United States. Judging by the colour gradient, Texas has the highest death rate due to cancer (the plotted value is the sum of the county-level rates within each state).
Boxplot to check for outliers in the target variable
# Box plot of the target variable on a single 7x7 figure.
cancer['target_deathrate'].plot.box(
    subplots=True,
    layout=(1, 1),
    figsize=[7, 7],
)
plt.show()
There are no outliers in the target variable
Let's check the distribution of the target variable, target_deathrate
# Histogram with a KDE overlay for the target variable.
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 in favour of
# histplot/displot — switch once the environment's seaborn version allows it.
sns.distplot(cancer['target_deathrate'], color="Green")
plt.xlabel('Target DeathRate')
# distplot draws a KDE by default, which normalises the bars to a density,
# so the y-axis shows density rather than raw counts.
plt.ylabel('Density')
plt.title('Histogram of Target DeathRate')
plt.show()
Since the target variable looks approximately normally distributed, we can apply linear regression directly, without transforming it first.
Multivariate plots allow us to see relationships between two or more different variables, all in one figure.
# Scatter plot: incidence rate against the target death rate.
# The axes labels and title are set on the Axes object returned by pandas.
cancer.plot.scatter(x='incidencerate', y='target_deathrate', alpha=0.2).set(
    xlabel='Incidence Rate',
    ylabel='Death Rate',
    title='Incidence Rate vs Death Rate',
)
plt.show()
# Scatter plot: median income against the target death rate.
# The axes labels and title are set on the Axes object returned by pandas.
cancer.plot.scatter(x='medincome', y='target_deathrate', alpha=0.2).set(
    xlabel='Median Income',
    ylabel='Death Rate',
    title='Median Income vs Death Rate',
)
plt.show()
There is a slight negative correlation between cancer death rate and median income.